This notebook is Tom’s analysis from raw data
source("../CamProt_R/Utility.R")
library(tidyverse)
[30m── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mtibble [30m 2.0.0 [32m✔[30m [34mpurrr [30m 0.2.5
[32m✔[30m [34mreadr [30m 1.3.1 [32m✔[30m [34mstringr[30m 1.3.1
[32m✔[30m [34mtibble [30m 2.0.0 [32m✔[30m [34mforcats[30m 0.3.0[39m
[30m── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mMSnbase[30m::[32mcombine()[30m masks [34mBiobase[30m::combine(), [34mBiocGenerics[30m::combine(), [34mdplyr[30m::combine()
[31m✖[30m [34mS4Vectors[30m::[32mexpand()[30m masks [34mtidyr[30m::expand()
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mS4Vectors[30m::[32mfirst()[30m masks [34mdplyr[30m::first()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31m✖[30m [34mBiocGenerics[30m::[32mPosition()[30m masks [34mggplot2[30m::Position(), [34mbase[30m::Position()
[31m✖[30m [34mpurrr[30m::[32mreduce()[30m masks [34mMSnbase[30m::reduce()
[31m✖[30m [34mS4Vectors[30m::[32mrename()[30m masks [34mdplyr[30m::rename()[39m
infile <- "Input/OOPS_qLOPIT_LabelFree_PeptideGroups_parsed.txt"
samples_inf <- "Input/samples.tsv"
peptides_df <- parse_features(infile, silac=FALSE, TMT=FALSE, level="peptide",
filter_crap=TRUE, protein_col='Master.Protein.Accessions')
Tally of features at each stage:
29067 All features
These features are associated with 2731 master proteins
27882 Excluding features without a master protein
These features are associated with 2730 master proteins
26302 Excluding features without a unique master protein
These features are associated with 2405 master proteins
26139 Excluding features matching a cRAP protein
These features are associated with 2396 master proteins
Identified an additional 0 proteins as 'cRAP associated'
colnames(peptides_df)
[1] "Checked" "Confidence" "Sequence"
[4] "Modifications" "Qvality.PEP" "Qvality.q.value"
[7] "Number.of.Protein.Groups" "Number.of.Proteins" "Number.of.PSMs"
[10] "Master.Protein.Accessions" "Number.of.Missed.Cleavages" "Theo.MHplus.in.Da"
[13] "Found.in.Sample.in.S1.F1.Sample" "Found.in.Sample.in.S2.F2.Sample" "Found.in.Sample.in.S3.F3.Sample"
[16] "Found.in.Sample.in.S4.F4.Sample" "Found.in.Sample.in.S6.F6.Sample" "Found.in.Sample.in.S5.F5.Sample"
[19] "Found.in.Sample.in.S7.F7.Sample" "Found.in.Sample.in.S8.F8.Sample" "Found.in.Sample.in.S9.F9.Sample"
[22] "Found.in.Sample.in.S10.F10.Sample" "Found.in.Sample.in.S11.F11.Sample" "Found.in.Sample.in.S12.F12.Sample"
[25] "Found.in.Sample.in.S13.F13.Sample" "Found.in.Sample.in.S14.F14.Sample" "Found.in.Sample.in.S15.F15.Sample"
[28] "Found.in.Sample.in.S16.F16.Sample" "Found.in.Sample.in.S17.F17.Sample" "Found.in.Sample.in.S18.F18.Sample"
[31] "Found.in.Sample.in.S19.F19.Sample" "Found.in.Sample.in.S20.F20.Sample" "Area.F1.Sample"
[34] "Area.F2.Sample" "Area.F3.Sample" "Area.F4.Sample"
[37] "Area.F6.Sample" "Area.F5.Sample" "Area.F7.Sample"
[40] "Area.F8.Sample" "Area.F9.Sample" "Area.F10.Sample"
[43] "Area.F11.Sample" "Area.F12.Sample" "Area.F13.Sample"
[46] "Area.F14.Sample" "Area.F15.Sample" "Area.F16.Sample"
[49] "Area.F17.Sample" "Area.F18.Sample" "Area.F19.Sample"
[52] "Area.F20.Sample" "XCorr.Sequest.HT" "Confidence.Sequest.HT"
[55] "Percolator.q.Value.Sequest.HT" "Percolator.PEP.Sequest.HT" "master_protein"
[58] "protein_length" "protein_description" "peptide_start"
[61] "peptide_end" "crap_protein" "associated_crap_protein"
[64] "unique" "filename"
peptides_quant <- makeMSNSet(obj=peptides_df, samples_inf=samples_inf, ab_col_ix=2, level="peptide", quant_name="Area")
tallies for missing data (# samples with missing)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
14 32 87 84 88 120 135 144 220 275 322 380 517 666 936 1234 1696 2528 4033 8276 4352
4352 peptides do not have any quantification values
So lots and lots of missing values! most peptides are quantified in only a very minor subset of fractions (<5/20). This is no suprise since we’re dealing with LFQ and subcellular fractionation here.
plotMissing(peptides_quant)
Out of 21787 total features, 21773 (99.936%) have missing values
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
32 87 84 88 120 135 144 220 275 322 380 517 666 936 1234 1696 2528 4033 8276
And 21741 features have more than one missing value
What about if we subset to GO annotated RBPs?
human_go <- readRDS("./Input/h_sapiens_go_full.rds")
GO_RBPs <- human_go %>% filter(GO.ID=="GO:0003723") %>% pull(UNIPROTKB)
peptides_quant[fData(peptides_quant)$master_protein %in% GO_RBPs,] %>% plotMissing()
Ok, so this doesn’t make any difference, 99.9% have a missing value!
Let’s aggregate to the unique peptide sequence level (ignorning variable modifications) and then see whether that reduces our problem
peptides_no_mod_quant <- agg_to_peptides(peptides_quant)
Your data contains missing values. Please read the relevant section in the combineFeatures manual page for
details the effects of missing values on data aggregation.
Nope, not really! We still have 19304 features (previously 21688) and 99.9% have missing values!
plotMissing(peptides_no_mod_quant)
Out of 19403 total features, 19389 (99.928%) have missing values
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
33 100 85 84 129 144 147 220 295 338 385 525 682 911 1202 1647 2397 3582 6483
And 19356 features have more than one missing value
OK, so we’re going to have to impute. Note that the missing valuesa are particularly present in the first 2 fractions and across fractions 3-7. After that we see fewer missing values. Also, remember that we have yet to identify any definite set of RNAs from the initial fractions (1-7ish). For this reason, we’ll remove these first 5 fractions for now and leave fraction 6 & 7 as these are useful to separate light membranes
plot(colSums(is.na(exprs(peptides_no_mod_quant))))
#peptides_no_mod_quant_no_lm <- peptides_no_mod_quant[,8:20]
peptides_no_mod_quant_no_lm <- peptides_no_mod_quant[,6:20]
plotMissing(peptides_no_mod_quant_no_lm)
Out of 19403 total features, 19258 (99.253%) have missing values
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
121 188 233 267 291 332 478 594 816 1063 1562 2381 3561 6592 779
And 19137 features have more than one missing value
png("./Output/plots/missing_values_peptide.png")
plotMissing(peptides_no_mod_quant_no_lm)
Out of 19403 total features, 19258 (99.253%) have missing values
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
121 188 233 267 291 332 478 594 816 1063 1562 2381 3561 6592 779
And 19137 features have more than one missing value
dev.off()
png
2
Ok, so now we’re down to just 99.1% missing :p but at least we have some more peptides now with relatively few (<=4 missing values)
If we focus on those peptides with 4-9 missing values, we can see that many are missing in a block of sequential fractions. Arguably, these are true missing values, e.g not at random.
missing_n <- rowSums(is.na(exprs(peptides_no_mod_quant_no_lm)))
peptides_no_mod_quant_no_lm[(missing_n>=4 & missing_n<=9),] %>% plotMissing()
Out of 2778 total features, 2778 (100%) have missing values
4 5 6 7 8 9
267 291 332 478 594 816
And 2778 features have more than one missing value
We can identify the missing values which are in a sequential block of >=5 fractions in a row and replace these with zero
First, let’s make a function to identify rows of values where the missing values are not random, e.g 4 or more in a row
test_values_1 <- c(1,1,NA,1,NA,NA,1,1,1,1)
test_values_2 <- c(1,1,NA,NA,NA,NA,NA,1,1,1)
test_values_3 <- c(NA,NA,NA,NA,1,1,1,1,1)
is_not_at_random <- function(x){
with(rle(is.na(x)), sum(lengths[values] >= 4))
}
is_not_at_random(test_values_1)
[1] 0
is_not_at_random(test_values_2)
[1] 1
is_not_at_random(test_values_3)
[1] 1
Now, let’s check this identifies the peptides which are not missing at random. First, we’ll remove those without at least 4 quantification values.
peptides_no_mod_quant_4 <- peptides_no_mod_quant_no_lm[missing_n<=(ncol(peptides_no_mod_quant_no_lm)-4),]
missing_not_at_random <- apply(exprs(peptides_no_mod_quant_4), 1, is_not_at_random)
peptides_no_mod_quant_4[missing_not_at_random==1,] %>% plotMissing()
Out of 3847 total features, 3847 (100%) have missing values
4 5 6 7 8 9 10 11
17 63 128 292 418 692 949 1288
And 3847 features have more than one missing value
peptides_no_mod_quant_4[missing_not_at_random==2,] %>% plotMissing()
Out of 379 total features, 379 (100%) have missing values
8 9 10 11
6 36 76 261
And 379 features have more than one missing value
Now, let’s extend the function to replace the blocks of missing values with zero
test_values_1 <- c(1,1,NA,1,NA,NA,1,1,1,1)
test_values_2 <- c(1,1,NA,NA,NA,NA,NA,1,1,1)
test_values_3 <- c(NA,NA,NA,NA,1,1,1,1,1, NA)
rle(is.na(test_values_1))$values
[1] FALSE TRUE FALSE TRUE FALSE
rle(is.na(test_values_1))$lengths
[1] 2 1 1 2 4
replace_missing_not_at_random <- function(x){
missing_rle <- rle(is.na(x))
sequential_blocks <- missing_rle$values
sequential_blocks[missing_rle$lengths<4] <- FALSE
replace_with_zero <- rep(sequential_blocks, missing_rle$lengths)
out <- x
out[replace_with_zero]<-0
return(out)
}
replace_missing_not_at_random(test_values_1)
[1] 1 1 NA 1 NA NA 1 1 1 1
replace_missing_not_at_random(test_values_2)
[1] 1 1 0 0 0 0 0 1 1 1
replace_missing_not_at_random(test_values_3)
[1] 0 0 0 0 1 1 1 1 1 NA
Below we impute a maximum of 10 missing values in sequential blocks with zeros
missing_n2 <- rowSums(is.na(exprs(peptides_no_mod_quant_4)))
peptides_no_mod_quant_4_mnar_zero <- peptides_no_mod_quant_4[missing_n2<=10,]
exprs(peptides_no_mod_quant_4_mnar_zero) <- exprs(peptides_no_mod_quant_4_mnar_zero) %>%
apply(1, replace_missing_not_at_random) %>% t()
Re-check the missing values
plotMissing(peptides_no_mod_quant_4)
Out of 6090 total features, 5945 (97.619%) have missing values
1 2 3 4 5 6 7 8 9 10 11
121 188 233 267 291 332 478 594 816 1063 1562
And 5824 features have more than one missing value
plotMissing(peptides_no_mod_quant_4_mnar_zero)
Out of 4528 total features, 3940 (87.014%) have missing values
1 2 3 4 5 6 7 8 9 10
639 792 740 586 415 286 186 170 88 38
And 3301 features have more than one missing value
peptides_no_mod_quant_4_mnar_zero_mar_knn <- peptides_no_mod_quant_4_mnar_zero
imputed_zeros <- rowSums(exprs(peptides_no_mod_quant_4_mnar_zero_mar_knn)==0, na.rm=TRUE)
missing_n3 <- rowSums(is.na(exprs(peptides_no_mod_quant_4_mnar_zero_mar_knn)))
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation <- imputed_zeros>0
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation_n <- imputed_zeros
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing <- missing_n3>0
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing_n <- missing_n3
peptides_no_mod_quant_4_mnar_zero_mar_knn <- peptides_no_mod_quant_4_mnar_zero_mar_knn[missing_n3<=3,]
print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation))
FALSE TRUE
687 2072
print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing))
FALSE TRUE
588 2171
print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation))
FALSE TRUE
FALSE 145 443
TRUE 542 1629
dim(peptides_no_mod_quant_4_mnar_zero)
[1] 4528 15
dim(peptides_no_mod_quant_4_mnar_zero_mar_knn)
[1] 2759 15
peptides_no_mod_quant_4_mnar_zero_mar_knn <- suppressMessages(impute(peptides_no_mod_quant_4_mnar_zero_mar_knn, "knn", k = 10))
Cluster size 2759 broken into 2532 227
Cluster size 2532 broken into 2182 350
Cluster size 2182 broken into 1982 200
Cluster size 1982 broken into 1432 550
Done cluster 1432
Done cluster 550
Done cluster 1982
Done cluster 200
Done cluster 2182
Done cluster 350
Done cluster 2532
Done cluster 227
p <- plotLabelQuant(peptides_no_mod_quant_no_lm, log=TRUE)
p <- plotLabelQuant(peptides_no_mod_quant_4_mnar_zero_mar_knn, log=TRUE)
p <- peptides_no_mod_quant_4_mnar_zero_mar_knn[
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,] %>% plotLabelQuant(log=TRUE)
p <- peptides_no_mod_quant_4_mnar_zero_mar_knn[
!fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,] %>% plotLabelQuant(log=TRUE)
protein_quant <- agg_to_protein(peptides_no_mod_quant_4_mnar_zero_mar_knn)
print(dim(protein_quant))
[1] 739 15
source("~/git_repos/CamProt_R/LOPIT.R")
markers_df <- read.delim("./Input//markers_9B_hyperLOPIT_vs_DC.csv", sep=",", header=FALSE, stringsAsFactors=FALSE)[,1:2]
markers_df$V2 <- recode(markers_df$V2, "RIBOSOME 40S"="RIBOSOME", "RIBOSOME 60S"="RIBOSOME")
markers_proteins <- markers_df$V2
names(markers_proteins) <- markers_df$V1
protein_quant_am <- addMarkers(normalise(protein_quant, "sum"), markers_proteins)
Markers in data: 123 out of 739
organelleMarkers
CYTOSOL ER GOLGI LYSOSOME MITOCHONDRIA NUCLEUS
8 21 4 2 5 16
NUCLEUS-CHROMATIN PEROXISOME PM PROTEASOME RIBOSOME unknown
6 1 18 3 39 616
p <- plotHexbin(protein_quant_am, "markers")
print(p)
marker_classes <- getMarkerClasses(protein_quant_am, "markers")
m_colours <- getColours(marker_classes)
p <- plotPCA(protein_quant_am, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_1_2_inc_glycoproteins.png")
Saving 7.29 x 4.5 in image
p <- plotPCA(protein_quant_am, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
the condition has length > 1 and only the first element will be used
print(p)
$p
glycoproteins <- read.delim("./Input/glycoproteins.tsv")$protein
Copied from Manasa’s oopsFinal.Rmd notebook
# We will add a bit more information to the fData file
# 1. List of known RBPs across cell lines in the XRNAX paper (Table S2)
xrnax = read.delim("./Input/xrnax-genelist.txt",sep="\t",header=T)
xrnax.rbps = xrnax %>%
dplyr::filter(!is.na(MCF7.RBP) | !is.na(HEK293.RBP) | !is.na(HeLa.RBP)) %>%
dplyr::select(Uniprot.ID:Protein.name)
rownames(xrnax.rbps) = xrnax.rbps$Uniprot.ID
print(length(rownames(xrnax.rbps)))
[1] 1753
# Check how many are common to the cell lines in the XRNAX paper
xrnax %>%
dplyr::select(MCF7.RBP:ihRBP) %>%
apply(2, table,useNA="always")
MCF7.RBP HEK293.RBP HeLa.RBP ihRBP
non-poly(A) 617 698 565 775
poly(A) 590 659 674 978
<NA> 1276 1126 1244 730
# 2. List of RBPs from SILAC experiments using OOPS after wash step2 (Table S1)
oops = read.delim("./Input/oops-genelist.txt",sep="\t",header=T)
oops.rbps = oops %>%
dplyr::filter(CL_NC_Ratio >= 1.0) %>%
dplyr::select(master_protein, RBP_glyco)
oops_rbps <- unique(oops.rbps$master_protein)
print(length(oops_rbps))
[1] 405
protein_quant_am_no_glyco <- protein_quant_am[!rownames(protein_quant_am) %in% glycoproteins,]
fData(protein_quant_am_no_glyco)$oops <- rownames(protein_quant_am_no_glyco) %in% oops_rbps
fData(protein_quant_am_no_glyco)$xrnax <- rownames(protein_quant_am_no_glyco) %in% rownames(xrnax.rbps)
fData(protein_quant_am_no_glyco)$go_rbp <- rownames(protein_quant_am_no_glyco) %in% GO_RBPs
print(table(fData(protein_quant_am_no_glyco)$oops, fData(protein_quant_am_no_glyco)$xrnax))
FALSE TRUE
FALSE 176 202
TRUE 21 239
print(table(fData(protein_quant_am_no_glyco)$oops, fData(protein_quant_am_no_glyco)$xrnax,
fData(protein_quant_am_no_glyco)$go_rbp))
, , = FALSE
FALSE TRUE
FALSE 149 104
TRUE 16 31
, , = TRUE
FALSE TRUE
FALSE 27 98
TRUE 5 208
print(dim(protein_quant_am))
[1] 739 15
print(dim(protein_quant_am_no_glyco))
[1] 638 15
marker_classes <- getMarkerClasses(protein_quant_am_no_glyco, "markers")
m_colours <- getColours(marker_classes)
p <- plotPCA(protein_quant_am_no_glyco, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_1_2.png")
Saving 7.29 x 4.5 in image
p <- plotPCA(protein_quant_am_no_glyco, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_3_4.png")
Saving 7.29 x 4.5 in image
protein_quant_am_no_glyco_yes_rbp <- protein_quant_am_no_glyco[
(fData(protein_quant_am_no_glyco)$oops |
fData(protein_quant_am_no_glyco)$xrnax |
fData(protein_quant_am_no_glyco)$go_rbp),]
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps.png")
Saving 7.29 x 4.5 in image
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_3_4_no_glyco_only_rbps.png")
Saving 7.29 x 4.5 in image
translocon <- human_go %>% filter(GO.ID=='GO:0071256') %>% pull(UNIPROTKB)
para <- human_go %>% filter(GO.ID=='GO:0042382') %>% pull(UNIPROTKB)
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, foi=translocon)
the condition has length > 1 and only the first element will be used
print(p)
$p
$p_foi
ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps_translocon.png")
Saving 7.29 x 4.5 in image
print(dim(protein_quant_am_no_glyco_yes_rbp))
[1] 489 15
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, foi=para)
the condition has length > 1 and only the first element will be used
print(p)
$p
$p_foi
ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps_paraspeckles.png")
Saving 7.29 x 4.5 in image
p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "markers", keep_markers=marker_classes, organelle_order=organelle_order) +
facet_wrap(~markers) +
scale_colour_manual(values=m_colours, guide=FALSE) +
theme(legend.position="none")
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
print(p)
ggsave("./Output/plots/marker_profiles.png")
Saving 10 x 10 in image
p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "markers", keep_markers=marker_classes,
organelle_order=organelle_order, unknown=TRUE) +
facet_grid(zero_imputation_n~missing_n) +
scale_colour_manual(values=m_colours, guide=FALSE) +
theme(legend.position="none")
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
print(p)
ggsave("./Output/plots/marker_profiles_imputation.png")
Saving 10 x 10 in image
Now we make a new set of markers designed to highlight functional groups of RBPs more usefully. First we need to define new sets of GO annotation proteins, where each marker belongs to only one group
translocon <- human_go %>% filter(GO.ID=='GO:0071256') %>% pull(UNIPROTKB)
para <- human_go %>% filter(GO.ID=='GO:0042382') %>% pull(UNIPROTKB)
mrna_splicing <- human_go %>% filter(GO.ID=='GO:0000398') %>% pull(UNIPROTKB)
translation_init <- human_go %>% filter(GO.ID=='GO:0006413') %>% pull(UNIPROTKB)
translation_init <- setdiff(translation_init, names(markers_proteins)[markers_proteins=="RIBOSOME"])
cell_cell_adhesion <- human_go %>% filter(GO.ID=='GO:0098609') %>% pull(UNIPROTKB)
cytoskeleton <- human_go %>% filter(GO.ID=='GO:0005856') %>% pull(UNIPROTKB)
motor_activity <- human_go %>% filter(GO.ID=='GO:0003774') %>% pull(UNIPROTKB)
er_stress_response <- human_go %>% filter(GO.ID=='GO:0030968') %>% pull(UNIPROTKB)
nuclear_pore_channel <- human_go %>% filter(GO.ID=='GO:0044613') %>% pull(UNIPROTKB)
nuclear_pore_basket <- human_go %>% filter(GO.ID=='GO:0044615') %>% pull(UNIPROTKB)
tRNA_AA <- human_go %>% filter(GO.ID=='GO:0004812') %>% pull(UNIPROTKB)
#mrna_splicing <- setdiff(mrna_splicing, c(para, translation_init, cell_cell_adhesion, cytoskeleton, motor_activity))
#translation_init <- setdiff(translation_init, c(para, cell_cell_adhesion, cytoskeleton, motor_activity))
#cytoskeleton <- setdiff(cytoskeleton, c(para, cell_cell_adhesion, motor_activity))
#cell_cell_adhesion <- setdiff(cell_cell_adhesion, c(para, motor_activity))
#all_markers <- c(mrna_splicing, para, translocon, translation_init, cell_cell_adhesion, cytoskeleton, motor_activity)
#print(length(all_markers)==length(unique(all_markers)))
mrna_splicing <- setdiff(mrna_splicing, tRNA_AA)
all_markers <- c(mrna_splicing, tRNA_AA)
print(length(all_markers)==length(unique(all_markers)))
[1] TRUE
rbps_markers <- markers_proteins
localisations_to_remove <- c("PEROXISOME", "PROTEASOME", "GOLGI", "LYSOSOME")
rbps_markers <- rbps_markers[!rbps_markers %in% localisations_to_remove]
rbps_markers[rbps_markers =="NUCLEUS-CHROMATIN"] <- "NUCLEUS"
print(table(rbps_markers))
rbps_markers
CYTOSOL ER MITOCHONDRIA NUCLEUS PM RIBOSOME
96 118 197 171 295 72
new_markers <- c(#rep("PARASPECKLES", length(para)),
rep("mRNA splicing", length(mrna_splicing)),
rep("Aminoacyl-tRNA ligase", length(tRNA_AA)),
"PARP1"#,
#rep("TRANSLATION INITITAION", length(translation_init)),
#rep("CELL-CELL ADHESION", length(cell_cell_adhesion)),
#rep("MOTOR", length(motor_activity))#,
#rep("CYTOSKELETON", length(cytoskeleton))
)
names(new_markers) <- c(#para,
mrna_splicing,
tRNA_AA,
"P09874"#,
#translation_init,
#cell_cell_adhesion,
#motor_activity#,
#cytoskeleton
)
print(table(names(new_markers))[table(names(new_markers))>1])
named integer(0)
rbps_markers <- rbps_markers[!names(rbps_markers) %in% names(new_markers)]
combined_markers <- c(rbps_markers, new_markers)
print(table(combined_markers))
combined_markers
Aminoacyl-tRNA ligase CYTOSOL ER MITOCHONDRIA mRNA splicing
42 93 118 193 321
NUCLEUS PARP1 PM RIBOSOME
149 1 295 72
print(table(names(combined_markers))[table(names(combined_markers))>1])
named integer(0)
fData(protein_quant_am_no_glyco_yes_rbp)$new_markers <- NULL
protein_quant_am_no_glyco_yes_rbp <- addMarkers(protein_quant_am_no_glyco_yes_rbp, combined_markers, "new_markers")
Markers in data: 168 out of 489
organelleMarkers
Aminoacyl-tRNA ligase CYTOSOL ER MITOCHONDRIA mRNA splicing
13 5 7 2 86
NUCLEUS PARP1 PM RIBOSOME unknown
12 1 3 39 321
fData(protein_quant_am_no_glyco_yes_rbp)$new_markers <- recode(
fData(protein_quant_am_no_glyco_yes_rbp)$new_markers, "NUCLEUS"="Nucleus", "RIBOSOME"="Ribosome", "CYTOSOL"="Cytosol",
"MITOCHONDRIA"="Mitochondria")
print(table(fData(protein_quant_am_no_glyco_yes_rbp)$new_markers))
Aminoacyl-tRNA ligase Cytosol ER Mitochondria mRNA splicing
13 5 7 2 86
Nucleus PARP1 PM Ribosome unknown
12 1 3 39 321
After adding these new RBP markers, we only have 1 PM and 2 Mt proteins remaining. Let’s remove the PM protein
fData(protein_quant_am_no_glyco_yes_rbp)$new_markers[fData(protein_quant_am_no_glyco_yes_rbp)$new_markers=="PM"] <- "unknown"
new_markers_levels <- getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers")
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "new_markers", re_order_markers=TRUE, marker_levels=new_markers_levels,
m_colours=getClassColours()[c(1:5, 7, 10:20)], foi=nuclear_pore_channel)
the condition has length > 1 and only the first element will be used
print(p)
$p
$p_foi
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "new_markers", re_order_markers=TRUE, marker_levels=new_markers_levels,
m_colours=getClassColours()[c(1:5, 7, 10:20)], foi=nuclear_pore_basket)
the condition has length > 1 and only the first element will be used
print(p)
$p
$p_foi
getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers")
[1] "Aminoacyl-tRNA ligase" "Cytosol" "ER" "Mitochondria"
[5] "mRNA splicing" "Nucleus" "PARP1" "Ribosome"
set.seed(1)
proj <- make_proj("t-SNE", protein_quant_am_no_glyco_yes_rbp, "new_markers")
Loading required namespace: Rtsne
library(Hmisc)
Loading required package: lattice
Loading required package: survival
Attaching package: ‘survival’
The following object is masked from ‘package:robustbase’:
heart
Loading required package: Formula
Attaching package: ‘Hmisc’
The following object is masked from ‘package:AnnotationDbi’:
contents
The following objects are masked from ‘package:MSnbase’:
impute, naplot
The following object is masked from ‘package:Biobase’:
contents
The following objects are masked from ‘package:dplyr’:
src, summarize
The following objects are masked from ‘package:base’:
format.pval, units
proj_df <- proj$PCA_df
marker_levels <- setdiff(new_markers_levels, "unknown")
marker_levels <- marker_levels[c(2,3,4,6,8,1,5,7)]
#m_colours <- getStockcol()[c(1,7,4,3,2,5)]
m_colours <- c(cbPalette[c(6,3,2,4,8,7,5)], "grey20")
proj_df$markers <- factor(proj_df$new_markers, levels=marker_levels)
print(table(is.na(proj_df$markers)))
FALSE TRUE
165 324
proj_df$unknown <- proj_df$new_markers=="unknown"
p <- ggplot(proj_df, aes(X, Y, colour=markers)) +
geom_point(aes(X, Y, colour=markers, alpha=unknown), size=2) +
scale_alpha_manual(values=c(1,0.2), guide=F) +
my_theme +
scale_colour_manual(values=m_colours, na.value="grey60", name="", breaks=c(marker_levels)) +
guides(colour = guide_legend(override.aes = list(alpha = 1, size=2))) +
xlab("Dimension 1") + ylab("Dimension 2")
print(p)
ggsave("./Output/tSNE_RBP_markers.png")
Saving 7.29 x 4.5 in image
write.table(proj_df, "./Output/tSNE_projections.tsv", sep="\t", quote=FALSE, row.name=FALSE)
p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "new_markers",
keep_markers=getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers"), organelle_order=new_markers_levels) +
facet_wrap(~markers) +
scale_colour_manual(values=getClassColours()[c(1:5, 7, 10:20)], guide=FALSE) +
theme(legend.position="none")
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
print(p)
protein_info <- read.delim("Input/Aggregated-proteins-2187-with-uniprot.tab") %>%
dplyr::select(Entry, gene_name=Gene.names...primary..)
total.prot = readRDS("./Input/prot_res_20_fractions_imputed_markers.rds")
colnames(total.prot)[12] <- "0.948"
colnames(total.prot) <- c(seq(1,20,2), seq(2,20,2))
total.prot <- total.prot[,order(as.numeric(as.character(colnames(total.prot))))]
total.prot <- total.prot[,5:19]
total.prot = normalise(total.prot,"sum")
rbp.prot <- protein_quant_am_no_glyco_yes_rbp
colnames(rbp.prot) <- 5:19
print(dim(total.prot))
[1] 5610 15
print(dim(rbp.prot))
[1] 489 15
print(colnames(total.prot))
[1] "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19"
print(colnames(rbp.prot))
[1] "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19"
plotCombinedProfiles <- function(foi){
foi_in_total <- intersect(foi, rownames(total.prot))
foi_in_rbp <- intersect(foi, rownames(rbp.prot))
foi_in_both <- intersect(foi_in_rbp, foi_in_total)
print(foi_in_both)
if(length(foi_in_both)==0){
return(NA)
}
total_exprs_df <- exprs(total.prot[foi_in_both,])
total_exprs_df <- melt(total_exprs_df)
total_exprs_df$type = "All protein"
rbp_exprs_df <- exprs(rbp.prot[foi_in_both,])
rbp_exprs_df <- melt(rbp_exprs_df)
rbp_exprs_df$type = "RNA-bound"
#print(head(total_exprs_df))
#print(head(rbp_exprs_df))
#print(head(rbind(rbp_exprs_df, total_exprs_df)))
p <- rbind(rbp_exprs_df, total_exprs_df) %>%
merge(protein_info, by.x="Var1", by.y="Entry") %>%
ggplot(aes(Var2, value, colour=type, group=type)) +
my_theme + geom_line() +
theme(aspect.ratio=1, axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) +
xlab("Fraction") + ylab("Sumn normalised\nabundance") +
scale_colour_discrete(name="", na.value="grey")
if(length(foi_in_both)>1){
p <- p + facet_wrap(~gene_name)
}
print(p)
p2 <- p <- rbind(rbp_exprs_df, total_exprs_df) %>%
merge(protein_info, by.x="Var1", by.y="Entry") %>%
ggplot(aes(Var2, value, colour=type, group=interaction(type, gene_name))) +
my_theme + geom_line() +
theme(aspect.ratio=1, axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) +
xlab("Fraction") + ylab("Sumn normalised\nabundance") +
scale_colour_discrete(name="", na.value="grey")
print(p2)
invisible(list("p"=p, "p2"=p2))
}
p <- plotCombinedProfiles(para)
[1] "P23246" "Q15233" "P52272"
ggsave("Output/plots/paraspeckle.png", p$p2)
Saving 7.29 x 4.5 in image
plotCombinedProfiles(tRNA_AA)
[1] "Q9NSE4" "P41252" "P54577" "P49589" "P49591" "P14868" "O43776" "P07814" "P47897" "P26639" "P26640" "P49588"
[13] "P41250"
plotCombinedProfiles(motor_activity)
[1] "Q14203" "P33176" "O00159" "Q14204" "P35579" "P52732" "Q13409" "P35580" "P60660"
fvarLabels(rbp.prot)
[1] "Checked" "Confidence" "Sequence"
[4] "Modifications" "Qvality.PEP" "Qvality.q.value"
[7] "Number.of.Protein.Groups" "Number.of.Proteins" "Number.of.PSMs"
[10] "Master.Protein.Accessions" "Number.of.Missed.Cleavages" "Theo.MHplus.in.Da"
[13] "XCorr.Sequest.HT" "Confidence.Sequest.HT" "Percolator.q.Value.Sequest.HT"
[16] "Percolator.PEP.Sequest.HT" "master_protein" "protein_length"
[19] "protein_description" "peptide_start" "peptide_end"
[22] "crap_protein" "associated_crap_protein" "unique"
[25] "filename" "zero_imputation" "zero_imputation_n"
[28] "missing" "missing_n" "CV.F6"
[31] "CV.F7" "CV.F8" "CV.F9"
[34] "CV.F10" "CV.F11" "CV.F12"
[37] "CV.F13" "CV.F14" "CV.F15"
[40] "CV.F16" "CV.F17" "CV.F18"
[43] "CV.F19" "CV.F20" "markers"
[46] "oops" "xrnax" "go_rbp"
[49] "new_markers"
well_quantified_rbps <- fData(rbp.prot) %>%
filter(zero_imputation_n<=4, missing_n<=2) %>%
pull(master_protein)
plotCombinedProfiles(intersect(mrna_splicing, well_quantified_rbps))
[1] "Q01081" "Q00839" "P67809" "P11142" "O43290" "Q13247" "Q15366" "Q8IYB3"
ribosome_proteins <- names(markers_proteins)[markers_proteins=="RIBOSOME"]
plotCombinedProfiles(intersect(ribosome_proteins, well_quantified_rbps))
[1] "P15880" "P46781" "P35268" "P62750" "Q02543" "P36578" "P47914"
plotCombinedProfiles("Q15942")
[1] "Q15942"
p <- plotCombinedProfiles("P09874")
ggsave("./Output/PARP1_profiles.png", p$p2)